Extract the data.
#Set training data and test data
train = read.csv("train.csv", stringsAsFactors = F)
test = read.csv("test.csv", stringsAsFactors = F)
Histogram/Plot and any additional exploratory analysis of the attributes.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
hist(train$MSSubClass)
hist(test$MSSubClass)
boxplot(train$MSSubClass)
boxplot(test$MSSubClass)
train1 <- data.frame(train$MSZoning)
ggplot(train1) + geom_bar(aes(x=train$MSZoning))
test1 <- data.frame(test$MSZoning)
ggplot(test1) + geom_bar(aes(x=test$MSZoning))
hist(train$LotFrontage)
hist(test$LotFrontage)
boxplot(train$LotFrontage)
boxplot(test$LotFrontage)
median(train$LotFrontage, na.rm= TRUE)
## [1] 69
median(test$LotFrontage, na.rm= TRUE)
## [1] 67
hist(train$LotArea, xlim = c(0,55000))
hist(test$LotArea, xlim = c(0,55000))
boxplot(train$LotArea)
boxplot(test$LotArea)
summary(train$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1300 7554 9478 10517 11602 215245
summary(test$LotArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1470 7391 9399 9819 11518 56600
train2 <- data.frame(train$Street)
ggplot(train2) + geom_bar(aes(x=train$Street))
test2 <- data.frame(test$Street)
ggplot(test2) + geom_bar(aes(x=test$Street))
train3 <- data.frame(train$Alley)
ggplot(train3) + geom_bar(aes(x=train$Alley))
test3<- data.frame(test$Alley)
ggplot(test3) + geom_bar(aes(x=test$Alley))
train4 <- data.frame(train$LotShape)
ggplot(train4) + geom_bar(aes(x=train$LotShape))
test4<- data.frame(test$LotShape)
ggplot(test4) + geom_bar(aes(x=test$LotShape))
train5 <- data.frame(train$LandContour)
ggplot(train5) + geom_bar(aes(x=train$LandContour))
test5<- data.frame(test$LandContour)
ggplot(test5) + geom_bar(aes(x=test$LandContour))
train6 <- data.frame(train$Utilities)
ggplot(train6) + geom_bar(aes(x=train$Utilities))
test6<- data.frame(test$Utilities)
ggplot(test6) + geom_bar(aes(x=test$Utilities))
train6 <- data.frame(train$LotConfig)
ggplot(train6) + geom_bar(aes(x=train$LotConfig))
test6<- data.frame(test$LotConfig)
ggplot(test6) + geom_bar(aes(x=test$LotConfig))
train6 <- data.frame(train$LandSlope)
ggplot(train6) + geom_bar(aes(x=train$LandSlope))
test6<- data.frame(test$LandSlope)
ggplot(test6) + geom_bar(aes(x=test$LandSlope))
train6 <- data.frame(train$Neighborhood)
ggplot(train6) + geom_bar(aes(x=train$Neighborhood))
test6<- data.frame(test$Neighborhood)
ggplot(test6) + geom_bar(aes(x=test$Neighborhood))
train6 <- data.frame(train$Condition1)
ggplot(train6) + geom_bar(aes(x=train$Condition1))
test6<- data.frame(test$Condition1)
ggplot(test6) + geom_bar(aes(x=test$Condition1))
train6 <- data.frame(train$Condition2)
ggplot(train6) + geom_bar(aes(x=train$Condition2))
test6<- data.frame(test$Condition2)
ggplot(test6) + geom_bar(aes(x=test$Condition2))
train6 <- data.frame(train$BldgType)
ggplot(train6) + geom_bar(aes(x=train$BldgType))
test6<- data.frame(test$BldgType)
ggplot(test6) + geom_bar(aes(x=test$BldgType))
train6 <- data.frame(train$HouseStyle)
ggplot(train6) + geom_bar(aes(x=train$HouseStyle))
test6<- data.frame(test$HouseStyle)
ggplot(test6) + geom_bar(aes(x=test$HouseStyle))
train6 <- data.frame(train$OverallQual)
ggplot(train6) + geom_bar(aes(x=train$OverallQual))
test6<- data.frame(test$OverallQual)
ggplot(test6) + geom_bar(aes(x=test$OverallQual))
train6 <- data.frame(train$OverallCond)
ggplot(train6) + geom_bar(aes(x=train$OverallCond))
test6<- data.frame(test$OverallCond)
ggplot(test6) + geom_bar(aes(x=test$OverallCond))
hist(train$YearBuilt)
hist(test$YearBuilt)
hist(train$YearRemodAdd)
hist(test$YearRemodAdd)
train6 <- data.frame(train$RoofStyle)
ggplot(train6) + geom_bar(aes(x=train$RoofStyle))
test6<- data.frame(test$RoofStyle)
ggplot(test6) + geom_bar(aes(x=test$RoofStyle))
train6 <- data.frame(train$RoofMatl)
ggplot(train6) + geom_bar(aes(x=train$RoofMatl))
test6<- data.frame(test$RoofMatl)
ggplot(test6) + geom_bar(aes(x=test$RoofMatl))
test6<- data.frame(test$RoofStyle)
ggplot(test6) + geom_bar(aes(x=test$RoofStyle))
test6<- data.frame(train$RoofStyle)
ggplot(test6) + geom_bar(aes(x=train$RoofStyle))
test6<- data.frame(train$Exterior1st)
ggplot(test6) + geom_bar(aes(x=train$Exterior1st))
test6<- data.frame(test$Exterior1st)
ggplot(test6) + geom_bar(aes(x=test$Exterior1st))
test6<- data.frame(train$Exterior2nd)
ggplot(test6) + geom_bar(aes(x=train$Exterior2nd))
test6<- data.frame(test$Exterior1st)
ggplot(test6) + geom_bar(aes(x=test$Exterior2nd))
test6<- data.frame(train$MasVnrType)
ggplot(test6) + geom_bar(aes(x=train$MasVnrType))
test6<- data.frame(test$MasVnrType)
ggplot(test6) + geom_bar(aes(x=test$MasVnrType))
hist(train$MasVnrArea)
hist(test$MasVnrArea)
test6<- data.frame(train$ExterQual)
ggplot(test6) + geom_bar(aes(x=train$ExterQual))
test6<- data.frame(test$ExterQual)
ggplot(test6) + geom_bar(aes(x=test$ExterQual))
test6<- data.frame(train$ExterCond)
ggplot(test6) + geom_bar(aes(x=train$ExterCond))
test6<- data.frame(test$ExterCond)
ggplot(test6) + geom_bar(aes(x=test$ExterCond))
test6<- data.frame(train$Foundation)
ggplot(test6) + geom_bar(aes(x=train$Foundation))
test6<- data.frame(test$ExterCond)
ggplot(test6) + geom_bar(aes(x=test$Foundation))
ggplot(data.frame(train$BsmtQual)) + geom_bar(aes(x=train$BsmtQual))
ggplot(data.frame(test$BsmtQual)) + geom_bar(aes(x=test$BsmtQual))
ggplot(data.frame(train$BsmtCond)) + geom_bar(aes(x=train$BsmtCond))
ggplot(data.frame(test$BsmtCond)) + geom_bar(aes(x=test$BsmtCond))
ggplot(data.frame(train$BsmtExposure)) + geom_bar(aes(x=train$BsmtExposure))
ggplot(data.frame(test$BsmtExposure)) + geom_bar(aes(x=test$BsmtExposure))
ggplot(data.frame(train$BsmtFinType1)) + geom_bar(aes(x=train$BsmtFinType1))
ggplot(data.frame(test$BsmtFinType1)) + geom_bar(aes(x=test$BsmtFinType1))
hist(train$BsmtFinSF1)
hist(test$BsmtFinSF1)
ggplot(data.frame(train$BsmtFinType2)) + geom_bar(aes(x=train$BsmtFinType2))
ggplot(data.frame(test$BsmtFinType2)) + geom_bar(aes(x=test$BsmtFinType2))
hist(train$BsmtFinSF2)
hist(test$BsmtFinSF2)
hist(train$BsmtUnfSF)
hist(test$BsmtUnfSF)
hist(train$TotalBsmtSF)
hist(test$TotalBsmtSF)
ggplot(data.frame(train$Heating)) + geom_bar(aes(x=train$Heating))
ggplot(data.frame(test$Heating)) + geom_bar(aes(x=test$Heating))
ggplot(data.frame(train$HeatingQC)) + geom_bar(aes(x=train$HeatingQC))
ggplot(data.frame(test$HeatingQC)) + geom_bar(aes(x=test$HeatingQC))
ggplot(data.frame(train$CentralAir)) + geom_bar(aes(x=train$CentralAir))
ggplot(data.frame(test$CentralAir)) + geom_bar(aes(x=test$CentralAir))
ggplot(data.frame(train$CentralAir)) + geom_bar(aes(x=train$CentralAir))
ggplot(data.frame(test$CentralAir)) + geom_bar(aes(x=test$CentralAir))
ggplot(data.frame(train$Electrical)) + geom_bar(aes(x=train$Electrical))
ggplot(data.frame(test$Electrical)) + geom_bar(aes(x=test$Electrical))
hist(train$X1stFlrSF)
hist(test$X1stFlrSF)
mean(train$X1stFlrSF)
## [1] 1162.627
mean(test$X1stFlrSF)
## [1] 1156.535
hist(train$X2ndFlrSF)
hist(test$X2ndFlrSF)
mean(train$X2ndFlrSF)
## [1] 346.9925
mean(test$X2ndFlrSF)
## [1] 325.9678
hist(train$LowQualFinSF)
hist(test$LowQualFinSF)
mean(train$LowQualFinSF)
## [1] 5.844521
mean(test$LowQualFinSF)
## [1] 3.543523
hist(train$GrLivArea)
hist(test$GrLivArea)
mean(train$GrLivArea)
## [1] 1515.464
mean(test$GrLivArea)
## [1] 1486.046
hist(train$BsmtFullBath)
hist(test$BsmtFullBath)
hist(train$BsmtHalfBath)
hist(test$BsmtHalfBath)
hist(train$BedroomAbvGr)
hist(test$Bedroom)
hist(train$KitchenAbvGr)
hist(test$KitchenAbvGr)
ggplot(data.frame(train$KitchenQual)) + geom_bar(aes(x=train$KitchenQual))
ggplot(data.frame(test$KitchenQual)) + geom_bar(aes(x=test$KitchenQual))
hist(train$TotRmsAbvGrd)
hist(test$TotRmsAbvGrd)
ggplot(data.frame(train$Functional)) + geom_bar(aes(x=train$Functional))
ggplot(data.frame(test$Functional)) + geom_bar(aes(x=test$Functional))
hist(train$Fireplaces)
hist(test$Fireplaces)
ggplot(data.frame(train$FireplaceQu)) + geom_bar(aes(x=train$FireplaceQu))
ggplot(data.frame(test$FireplaceQu)) + geom_bar(aes(x=test$FireplaceQu))
ggplot(data.frame(train$GarageType)) + geom_bar(aes(x=train$GarageType))
ggplot(data.frame(test$GarageType)) + geom_bar(aes(x=test$GarageType))
hist(train$GarageYrBlt)
hist(test$GarageYrBlt)
ggplot(data.frame(train$GarageFinish)) + geom_bar(aes(x=train$GarageFinish))
ggplot(data.frame(test$GarageFinish)) + geom_bar(aes(x=test$GarageFinish))
hist(train$GarageArea)
hist(test$GarageArea)
mean(train$GarageArea)
## [1] 472.9801
mean(test$GarageArea)
## [1] NA
ggplot(data.frame(train$GarageQual)) + geom_bar(aes(x=train$GarageQual))
ggplot(data.frame(test$GarageQual)) + geom_bar(aes(x=test$GarageQual))
ggplot(data.frame(train$GarageCond)) + geom_bar(aes(x=train$GarageCond))
ggplot(data.frame(test$GarageCond)) + geom_bar(aes(x=test$GarageCond))
ggplot(data.frame(train$PavedDrive)) + geom_bar(aes(x=train$PavedDrive))
ggplot(data.frame(test$PavedDrive)) + geom_bar(aes(x=test$PavedDrive))
hist(train$WoodDeckSF)
hist(test$WoodDeckSF)
hist(train$OpenPorchSF)
hist(test$OpenPorchSF)
hist(train$EnclosedPorch)
hist(test$EnclosedPorch)
hist(train$X3SsnPorch)
hist(test$X3SsnPorch)
hist(train$ScreenPorch)
hist(test$ScreenPorch)
hist(train$PoolArea)
hist(test$PoolArea)
ggplot(data.frame(train$PoolQC)) + geom_bar(aes(x=train$PoolQC))
ggplot(data.frame(test$PoolQC)) + geom_bar(aes(x=test$PoolQC))
ggplot(data.frame(train$Fence)) + geom_bar(aes(x=train$Fence))
ggplot(data.frame(test$Fence)) + geom_bar(aes(x=test$Fence))
ggplot(data.frame(train$MiscFeature)) + geom_bar(aes(x=train$MiscFeature))
ggplot(data.frame(test$MiscFeature)) + geom_bar(aes(x=test$MiscFeature))
hist(train$MiscVal)
hist(test$MiscVal)
mean(train$MiscVal)
## [1] 43.48904
mean(test$MiscVal)
## [1] 58.16792
hist(train$MoSold)
hist(test$MoSold)
hist(train$YrSold)
hist(test$YrSold)
ggplot(data.frame(train$SaleType)) + geom_bar(aes(x=train$SaleType))
ggplot(data.frame(test$SaleType)) + geom_bar(aes(x=test$SaleType))
ggplot(data.frame(train$SaleCondition)) + geom_bar(aes(x=train$SaleCondition))
ggplot(data.frame(test$SaleCondition)) + geom_bar(aes(x=test$SaleCondition))
Dimensions of the dataset for both the train and test dataset.
#Get dimensions of the train dataset
#There are 1,460 rows
nrow(train)
## [1] 1460
#There are 81 columns
ncol(train)
## [1] 81
#Get dimensions of the test data
#There are 1,459 rows in the test data
nrow(test)
## [1] 1459
#There are 80 columns in the test data
ncol(test)
## [1] 80
Determine features/variables that are categorical and numerical in nature in the train dataset
categoricalTrain <- names(which(sapply(train, class) == "character"))
#43 features are categorical in nature
length(categoricalTrain)
## [1] 43
categoricalTrain
## [1] "MSZoning" "Street" "Alley" "LotShape"
## [5] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [9] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [13] "HouseStyle" "RoofStyle" "RoofMatl" "Exterior1st"
## [17] "Exterior2nd" "MasVnrType" "ExterQual" "ExterCond"
## [21] "Foundation" "BsmtQual" "BsmtCond" "BsmtExposure"
## [25] "BsmtFinType1" "BsmtFinType2" "Heating" "HeatingQC"
## [29] "CentralAir" "Electrical" "KitchenQual" "Functional"
## [33] "FireplaceQu" "GarageType" "GarageFinish" "GarageQual"
## [37] "GarageCond" "PavedDrive" "PoolQC" "Fence"
## [41] "MiscFeature" "SaleType" "SaleCondition"
print(" ")
## [1] " "
#Count the number of features that are numerical in the test dataset
numericalTrain <- names(which(sapply(train, class) != "character"))
#38 features are numerical in nature
length(numericalTrain)
## [1] 38
numericalTrain
## [1] "Id" "MSSubClass" "LotFrontage" "LotArea"
## [5] "OverallQual" "OverallCond" "YearBuilt" "YearRemodAdd"
## [9] "MasVnrArea" "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF"
## [13] "TotalBsmtSF" "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF"
## [17] "GrLivArea" "BsmtFullBath" "BsmtHalfBath" "FullBath"
## [21] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd"
## [25] "Fireplaces" "GarageYrBlt" "GarageCars" "GarageArea"
## [29] "WoodDeckSF" "OpenPorchSF" "EnclosedPorch" "X3SsnPorch"
## [33] "ScreenPorch" "PoolArea" "MiscVal" "MoSold"
## [37] "YrSold" "SalePrice"
Determine features/variables that are categorical and numerical in nature in the test dataset
categoricalTest <- names(which(sapply(test, class) == "character"))
#43 features are categorical in nature
length(categoricalTest)
## [1] 43
numericalTest <- names(which(sapply(test, class) != "character"))
#37 features are numerical in nature
length(numericalTest)
## [1] 37
#There is a discrepancy in the number of features between the test and train dataset
trainColumns <- colnames(train)
testColumns <- colnames(test)
#The result is the SalePrice attribute is missing in the test dataset but that is to be expected.
matchV <- match(trainColumns, testColumns)
head(train[is.na(matchV)])
It can be noted that the following categorical features are noted with “NA” but it does not necessarily mean that the feature has an invalid value due to error but it actually does not possess the physical feature.
print("Train dataset that have NA values")
## [1] "Train dataset that have NA values"
for (col in 1:ncol(train)) {
x<- colnames(train[col])
y<- sum(is.na(train[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "LotFrontage 259"
## [1] "Alley 1369"
## [1] "MasVnrType 8"
## [1] "MasVnrArea 8"
## [1] "BsmtQual 37"
## [1] "BsmtCond 37"
## [1] "BsmtExposure 38"
## [1] "BsmtFinType1 37"
## [1] "BsmtFinType2 38"
## [1] "Electrical 1"
## [1] "FireplaceQu 690"
## [1] "GarageType 81"
## [1] "GarageYrBlt 81"
## [1] "GarageFinish 81"
## [1] "GarageQual 81"
## [1] "GarageCond 81"
## [1] "PoolQC 1453"
## [1] "Fence 1179"
## [1] "MiscFeature 1406"
print("Test data that have NA values")
## [1] "Test data that have NA values"
for (col in 1:ncol(test)) {
x<- colnames(test[col])
y<- sum(is.na(test[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "MSZoning 4"
## [1] "LotFrontage 227"
## [1] "Alley 1352"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "MasVnrArea 15"
## [1] "BsmtQual 44"
## [1] "BsmtCond 45"
## [1] "BsmtExposure 44"
## [1] "BsmtFinType1 42"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinType2 42"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "FireplaceQu 730"
## [1] "GarageType 76"
## [1] "GarageYrBlt 78"
## [1] "GarageFinish 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
## [1] "GarageQual 78"
## [1] "GarageCond 78"
## [1] "PoolQC 1456"
## [1] "Fence 1169"
## [1] "MiscFeature 1408"
## [1] "SaleType 1"
To combat this issue, we will replace the “NA” in these to their actual value based on the data description text file.
train$Alley[is.na(train$Alley)] = "No alley access"
test$Alley[is.na(test$Alley)] = "No alley access"
train$BsmtQual[is.na(train$BsmtQual)] = "No Basement"
test$BsmtQual[is.na(test$BsmtQual)] = "No Basement"
train$BsmtCond[is.na(train$BsmtCond)] = "No Basement"
test$BsmtCond[is.na(test$BsmtCond)] = "No Basement"
train$BsmtExposure[is.na(train$BsmtExposure)] = "No Basement"
test$BsmtExposure[is.na(test$BsmtExposure)] = "No Basement"
train$BsmtFinType1[is.na(train$BsmtFinType1)] = "No Basement"
test$BsmtFinType1[is.na(test$BsmtFinType1)] = "No Basement"
train$BsmtFinType2[is.na(train$BsmtFinType2)] = "No Basement"
test$BsmtFinType2[is.na(test$BsmtFinType2)] = "No Basement"
train$FireplaceQu[is.na(train$FireplaceQu)] = "No Fireplace"
test$FireplaceQu[is.na(test$FireplaceQu)] = "No Fireplace"
train$GarageType[is.na(train$GarageType)] = "No Garage"
test$GarageType[is.na(test$GarageType)] = "No Garage"
train$GarageFinish[is.na(train$GarageFinish)] = "No Garage"
test$GarageFinish[is.na(test$GarageFinish)] = "No Garage"
train$GarageQual[is.na(train$GarageQual)] = "No Garage"
test$GarageQual[is.na(test$GarageQual)] = "No Garage"
train$GarageCond[is.na(train$GarageCond)] = "No Garage"
test$GarageCond[is.na(test$GarageCond)] = "No Garage"
train$PoolQC[is.na(train$PoolQC)] = "No Pool"
test$PoolQC[is.na(test$PoolQC)] = "No Pool"
train$Fence[is.na(train$Fence)] = "No Fence"
test$Fence[is.na(test$Fence)] = "No Fence"
train$MiscFeature[is.na(train$MiscFeature)] = "None"
test$MiscFeature[is.na(test$MiscFeature)] = "None"
With the changed values, the “NA” values remaining are numerical or categorical values that are not detailed in the data description text file.
print("Train Features that have NA")
## [1] "Train Features that have NA"
for (col in 1:ncol(train)) {
x<- colnames(train[col])
y<- sum(is.na(train[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "LotFrontage 259"
## [1] "MasVnrType 8"
## [1] "MasVnrArea 8"
## [1] "Electrical 1"
## [1] "GarageYrBlt 81"
print("Test Features that have NA")
## [1] "Test Features that have NA"
for (col in 1:ncol(test)) {
x<- colnames(test[col])
y<- sum(is.na(test[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "MSZoning 4"
## [1] "LotFrontage 227"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "MasVnrArea 15"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "GarageYrBlt 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
## [1] "SaleType 1"
These following categorical variables still have NA values.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
categoricalTestNA <- test %>% select_if(is.character)
print("Test Categorical Features")
## [1] "Test Categorical Features"
for (col in 1:ncol(categoricalTestNA)) {
x<- colnames(categoricalTestNA[col])
y<- sum(is.na(categoricalTestNA[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "MSZoning 4"
## [1] "Utilities 2"
## [1] "Exterior1st 1"
## [1] "Exterior2nd 1"
## [1] "MasVnrType 16"
## [1] "KitchenQual 1"
## [1] "Functional 2"
## [1] "SaleType 1"
categoricalTrainNA <- train %>% select_if(is.character)
print("Train Categorical Features")
## [1] "Train Categorical Features"
for (col in 1:ncol(categoricalTrainNA)) {
x<- colnames(categoricalTrainNA[col])
y<- sum(is.na(categoricalTrainNA[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "MasVnrType 8"
## [1] "Electrical 1"
For the following categorical features that have NA, replace with the mode value.
#mode function
Mode <- function(x) {
ux <- unique(x)
ux[which.max((tabulate(match(x, ux))))]
}
train$MasVnrType[is.na(train$MasVnrType)] = Mode(train$MasVnrType)
train$Electrical[is.na(train$Electrical)] = Mode(train$Electrical)
test$MSZoning[is.na(test$MSZoning)] = Mode(test$MSZoning)
test$Utilities[is.na(test$Utilities)] = Mode(test$Utilities)
test$Exterior1st[is.na(test$Exterior1st)] = Mode(test$Exterior1st)
test$Exterior2nd[is.na(test$Exterior2nd)] = Mode(test$Exterior2nd)
test$MasVnrType[is.na(test$MasVnrType)] = Mode(test$MasVnrType)
test$KitchenQual[is.na(test$KitchenQual)] = Mode(test$KitchenQual)
test$Functional[is.na(test$Functional)] = Mode(test$Functional)
test$SaleType[is.na(test$SaleType)] = Mode(test$SaleType)
For the numerical features, it seems plausible that a “NA” dictate that it is in fact 0. This links directly with the fact that some houses may not have a basement, alley, garage, fence, and etc.
#These are the numerical features left with NAs
numericalTestNA <- test %>% select_if(is.numeric)
print("Test Numerical Features")
## [1] "Test Numerical Features"
for (col in 1:ncol(numericalTestNA)) {
x<- colnames(numericalTestNA[col])
y<- sum(is.na(numericalTestNA[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "LotFrontage 227"
## [1] "MasVnrArea 15"
## [1] "BsmtFinSF1 1"
## [1] "BsmtFinSF2 1"
## [1] "BsmtUnfSF 1"
## [1] "TotalBsmtSF 1"
## [1] "BsmtFullBath 2"
## [1] "BsmtHalfBath 2"
## [1] "GarageYrBlt 78"
## [1] "GarageCars 1"
## [1] "GarageArea 1"
numericalTrainNA <- train %>% select_if(is.numeric)
print("Train Numerical Features")
## [1] "Train Numerical Features"
for (col in 1:ncol(numericalTrainNA)) {
x<- colnames(numericalTrainNA[col])
y<- sum(is.na(numericalTrainNA[col]))
if (y > 0) {
print(paste(x,y))
}
}
## [1] "LotFrontage 259"
## [1] "MasVnrArea 8"
## [1] "GarageYrBlt 81"
To address this issue, we will assign 0 to the features that contain “Bsmt” and “Garage”.
test$BsmtFinSF1[is.na(test$BsmtFinSF1)] = 0
test$BsmtFinSF2[is.na(test$BsmtFinSF2)] = 0
test$BsmtUnfSF[is.na(test$BsmtUnfSF)] = 0
test$TotalBsmtSF[is.na(test$TotalBsmtSF)] = 0
test$BsmtFullBath[is.na(test$BsmtFullBath)] = 0
test$BsmtHalfBath[is.na(test$BsmtHalfBath)] = 0
test$GarageCars[is.na(test$GarageCars)] = 0
test$GarageArea[is.na(test$GarageArea)] = 0
Secondly, for the GarageYrBlt and MasVnrArea, putting a 0 wouldn’t make sense. There are 78 rows that do not have a year assigned that could actually have a garage and the for MasVnrArea, there is some areas just not being report for the masonry veneer area because info was lacking. Instead of removing the rows completely (effectively removing a minimum of 78 rows and up to a maximum of 86 rows), we can assign a value of -1 that assign a non-logical value and can help identify that these are outliers.
test$GarageYrBlt[is.na(test$GarageYrBlt)] = -1
test$MasVnrArea[is.na(test$MasVnrArea)] = -1
train$MasVnrArea[is.na(train$MasVnrArea)] = -1
train$GarageYrBlt[is.na(train$GarageYrBlt)] = -1
For the case of the missing LotFrontage area, we can assume there are outliers present and so a median approach can be used for these missing values
test$LotFrontage[is.na(test$LotFrontage)] = median(test$LotFrontage, na.rm = TRUE)
train$LotFrontage[is.na(train$LotFrontage)] = median(train$LotFrontage, na.rm = TRUE)
Corrplot to see what is plotted in relevance
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(tidyverse)
library(dplyr)
numValues <- select_if(train, is.numeric)
trainCor <- cor(numValues)
corrplot(trainCor, method = "shade")
threshold <- 0.5
cor_filter <- trainCor
diag(cor_filter) <- 0
filter <- apply(cor_filter,1, function(x) sum(abs(x) >= threshold))
sel <- filter
cor_final <- cor_filter[sel, sel]
corrplot(cor_final, method = "color")
For the purposes of forming a linear regression model, the categorical attributes most be assigned a number value instead of a character value.
#Perform all the changes on a second duplicate data frame
modifiedTest <- test
modifiedTrain <- train
#Modifying the values to the number values; they are assigned alphabetically
modifiedTest$MSZoning <- as.numeric(factor(modifiedTest$MSZoning))
modifiedTest$Street <- as.numeric(factor(modifiedTest$Street))
modifiedTest$Alley <- as.numeric(factor(modifiedTest$Alley))
modifiedTest$LotShape <- as.numeric(factor(modifiedTest$LotShape))
modifiedTest$LandContour <- as.numeric(factor(modifiedTest$LandContour))
modifiedTest$Utilities <- as.numeric(factor(modifiedTest$Utilities))
modifiedTest$LotConfig <- as.numeric(factor(modifiedTest$LotConfig))
modifiedTest$LandSlope <- as.numeric(factor(modifiedTest$LandSlope))
modifiedTest$Neighborhood <- as.numeric(factor(modifiedTest$Neighborhood))
modifiedTest$Condition1 <- as.numeric(factor(modifiedTest$Condition1))
modifiedTest$Condition2 <- as.numeric(factor(modifiedTest$Condition2))
modifiedTest$BldgType <- as.numeric(factor(modifiedTest$BldgType))
modifiedTest$HouseStyle <- as.numeric(factor(modifiedTest$HouseStyle))
modifiedTest$RoofStyle <- as.numeric(factor(modifiedTest$RoofStyle))
modifiedTest$RoofMatl <- as.numeric(factor(modifiedTest$RoofMatl))
modifiedTest$Exterior1st <- as.numeric(factor(modifiedTest$Exterior1st))
modifiedTest$Exterior2nd <- as.numeric(factor(modifiedTest$Exterior2nd))
modifiedTest$MasVnrType <- as.numeric(factor(modifiedTest$MasVnrType))
modifiedTest$ExterQual <- as.numeric(factor(modifiedTest$ExterQual))
modifiedTest$ExterCond <- as.numeric(factor(modifiedTest$ExterCond))
modifiedTest$Foundation <- as.numeric(factor(modifiedTest$Foundation))
modifiedTest$BsmtQual <- as.numeric(factor(modifiedTest$BsmtQual))
modifiedTest$BsmtCond <- as.numeric(factor(modifiedTest$BsmtCond))
modifiedTest$BsmtExposure <- as.numeric(factor(modifiedTest$BsmtExposure))
modifiedTest$BsmtFinType1 <- as.numeric(factor(modifiedTest$BsmtFinType1))
modifiedTest$BsmtFinType2 <- as.numeric(factor(modifiedTest$BsmtFinType2))
modifiedTest$Heating <- as.numeric(factor(modifiedTest$Heating))
modifiedTest$HeatingQC <- as.numeric(factor(modifiedTest$HeatingQC))
modifiedTest$CentralAir <- as.numeric(factor(modifiedTest$CentralAir))
modifiedTest$Electrical <- as.numeric(factor(modifiedTest$Electrical))
modifiedTest$KitchenQual <- as.numeric(factor(modifiedTest$KitchenQual))
modifiedTest$Functional <- as.numeric(factor(modifiedTest$Functional))
modifiedTest$FireplaceQu <- as.numeric(factor(modifiedTest$FireplaceQu))
modifiedTest$GarageType <- as.numeric(factor(modifiedTest$GarageType))
modifiedTest$GarageFinish <- as.numeric(factor(modifiedTest$GarageFinish))
modifiedTest$GarageQual <- as.numeric(factor(modifiedTest$GarageQual))
modifiedTest$GarageCond <- as.numeric(factor(modifiedTest$GarageCond))
modifiedTest$PavedDrive <- as.numeric(factor(modifiedTest$PavedDrive))
modifiedTest$PoolQC <- as.numeric(factor(modifiedTest$PoolQC))
modifiedTest$Fence <- as.numeric(factor(modifiedTest$Fence))
modifiedTest$MiscFeature <- as.numeric(factor(modifiedTest$MiscFeature))
modifiedTest$SaleType <- as.numeric(factor(modifiedTest$SaleType))
modifiedTest$SaleCondition <- as.numeric(factor(modifiedTest$SaleCondition))
modifiedTrain$MSZoning <- as.numeric(factor(modifiedTrain$MSZoning))
modifiedTrain$Street <- as.numeric(factor(modifiedTrain$Street))
modifiedTrain$Alley <- as.numeric(factor(modifiedTrain$Alley))
modifiedTrain$LotShape <- as.numeric(factor(modifiedTrain$LotShape))
modifiedTrain$LandContour <- as.numeric(factor(modifiedTrain$LandContour))
modifiedTrain$Utilities <- as.numeric(factor(modifiedTrain$Utilities))
modifiedTrain$LotConfig <- as.numeric(factor(modifiedTrain$LotConfig))
modifiedTrain$LandSlope <- as.numeric(factor(modifiedTrain$LandSlope))
modifiedTrain$Neighborhood <- as.numeric(factor(modifiedTrain$Neighborhood))
modifiedTrain$Condition1 <- as.numeric(factor(modifiedTrain$Condition1))
modifiedTrain$Condition2 <- as.numeric(factor(modifiedTrain$Condition2))
modifiedTrain$BldgType <- as.numeric(factor(modifiedTrain$BldgType))
modifiedTrain$HouseStyle <- as.numeric(factor(modifiedTrain$HouseStyle))
modifiedTrain$RoofStyle <- as.numeric(factor(modifiedTrain$RoofStyle))
modifiedTrain$RoofMatl <- as.numeric(factor(modifiedTrain$RoofMatl))
modifiedTrain$Exterior1st <- as.numeric(factor(modifiedTrain$Exterior1st))
modifiedTrain$Exterior2nd <- as.numeric(factor(modifiedTrain$Exterior2nd))
modifiedTrain$MasVnrType <- as.numeric(factor(modifiedTrain$MasVnrType))
modifiedTrain$ExterQual <- as.numeric(factor(modifiedTrain$ExterQual))
modifiedTrain$ExterCond <- as.numeric(factor(modifiedTrain$ExterCond))
modifiedTrain$Foundation <- as.numeric(factor(modifiedTrain$Foundation))
modifiedTrain$BsmtQual <- as.numeric(factor(modifiedTrain$BsmtQual))
modifiedTrain$BsmtCond <- as.numeric(factor(modifiedTrain$BsmtCond))
modifiedTrain$BsmtExposure <- as.numeric(factor(modifiedTrain$BsmtExposure))
modifiedTrain$BsmtFinType1 <- as.numeric(factor(modifiedTrain$BsmtFinType1))
modifiedTrain$BsmtFinType2 <- as.numeric(factor(modifiedTrain$BsmtFinType2))
modifiedTrain$Heating <- as.numeric(factor(modifiedTrain$Heating))
modifiedTrain$HeatingQC <- as.numeric(factor(modifiedTrain$HeatingQC))
modifiedTrain$CentralAir <- as.numeric(factor(modifiedTrain$CentralAir))
modifiedTrain$Electrical <- as.numeric(factor(modifiedTrain$Electrical))
modifiedTrain$KitchenQual <- as.numeric(factor(modifiedTrain$KitchenQual))
modifiedTrain$Functional <- as.numeric(factor(modifiedTrain$Functional))
modifiedTrain$FireplaceQu <- as.numeric(factor(modifiedTrain$FireplaceQu))
modifiedTrain$GarageType <- as.numeric(factor(modifiedTrain$GarageType))
modifiedTrain$GarageFinish <- as.numeric(factor(modifiedTrain$GarageFinish))
modifiedTrain$GarageQual <- as.numeric(factor(modifiedTrain$GarageQual))
modifiedTrain$GarageCond <- as.numeric(factor(modifiedTrain$GarageCond))
modifiedTrain$PavedDrive <- as.numeric(factor(modifiedTrain$PavedDrive))
modifiedTrain$PoolQC <- as.numeric(factor(modifiedTrain$PoolQC))
modifiedTrain$Fence <- as.numeric(factor(modifiedTrain$Fence))
modifiedTrain$MiscFeature <- as.numeric(factor(modifiedTrain$MiscFeature))
modifiedTrain$SaleType <- as.numeric(factor(modifiedTrain$SaleType))
modifiedTrain$SaleCondition <- as.numeric(factor(modifiedTrain$SaleCondition))
For feature selection, we will use the Random Forest based on the Boruta algorithm.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(100)
x <- modifiedTrain[1:1000,]
y<- modifiedTrain[1001:1451,]
rfImp <- randomForest(SalePrice ~., data = x, ntree = 500, importance = TRUE, na.action = na.roughfix)
importance(rfImp)
## %IncMSE IncNodePurity
## Id -0.33724662 2.564736e+10
## MSSubClass 9.40657763 1.905665e+10
## MSZoning 8.98869457 1.024157e+10
## LotFrontage 5.19909463 5.041257e+10
## LotArea 11.06339283 1.295530e+11
## Street -1.00100150 1.106835e+08
## Alley 3.02263337 1.290483e+09
## LotShape 2.58967033 6.885532e+09
## LandContour 0.13054549 9.691593e+09
## Utilities 0.00000000 1.963291e+06
## LotConfig 0.67618849 5.361574e+09
## LandSlope 2.98896791 8.458013e+09
## Neighborhood 9.33074734 4.003759e+10
## Condition1 1.02740124 3.889137e+09
## Condition2 -1.42452578 8.647074e+08
## BldgType 4.51208201 4.663291e+09
## HouseStyle 3.46535898 7.495093e+09
## OverallQual 23.40366716 1.541388e+12
## OverallCond 6.09570086 2.330339e+10
## YearBuilt 14.98798572 3.290240e+11
## YearRemodAdd 7.19150012 6.942981e+10
## RoofStyle 2.36801327 8.304531e+09
## RoofMatl -2.30948584 1.203275e+10
## Exterior1st 5.42065871 1.551306e+10
## Exterior2nd 3.96007362 1.279377e+10
## MasVnrType 2.92117368 6.331765e+09
## MasVnrArea 5.80526822 7.276429e+10
## ExterQual 8.74813658 2.457379e+11
## ExterCond 1.24140494 3.964406e+09
## Foundation 3.94061910 6.122764e+09
## BsmtQual 6.25793799 1.335838e+11
## BsmtCond 0.55267747 2.386854e+09
## BsmtExposure 1.56410906 1.271892e+10
## BsmtFinType1 7.95466107 1.709935e+10
## BsmtFinSF1 6.95421001 1.512946e+11
## BsmtFinType2 1.64515276 3.596004e+09
## BsmtFinSF2 2.51958984 5.504324e+09
## BsmtUnfSF 5.06491772 3.479275e+10
## TotalBsmtSF 15.15309387 2.950302e+11
## Heating 0.17066201 1.404862e+09
## HeatingQC 5.34209808 5.924930e+09
## CentralAir 3.92472315 7.690464e+09
## Electrical -0.36386615 1.602042e+09
## X1stFlrSF 13.82739428 2.556271e+11
## X2ndFlrSF 10.20407855 1.173135e+11
## LowQualFinSF -1.35605770 1.395875e+09
## GrLivArea 33.05018382 9.188650e+11
## BsmtFullBath 3.22583630 1.261374e+10
## BsmtHalfBath 3.10975364 1.113360e+10
## FullBath 9.98561570 1.061800e+11
## HalfBath 7.47745831 1.021306e+10
## BedroomAbvGr 4.92004896 1.445991e+10
## KitchenAbvGr 5.41672243 9.237209e+09
## KitchenQual 6.00693127 9.629769e+10
## TotRmsAbvGrd 5.11237367 8.159518e+10
## Functional 1.91323527 3.539223e+09
## Fireplaces 10.45799148 4.703972e+10
## FireplaceQu 6.36361050 1.225761e+10
## GarageType 10.36822854 4.334701e+10
## GarageYrBlt 8.99796576 1.166383e+11
## GarageFinish 6.99874961 1.305272e+10
## GarageCars 12.39968619 6.834243e+11
## GarageArea 12.24536826 2.542581e+11
## GarageQual 1.62335910 5.723552e+09
## GarageCond 2.79419399 2.875112e+09
## PavedDrive 4.97894854 6.382699e+09
## WoodDeckSF 2.81724635 3.129951e+10
## OpenPorchSF 4.81242923 3.248856e+10
## EnclosedPorch 0.21952135 5.960823e+09
## X3SsnPorch 0.62408749 4.570850e+08
## ScreenPorch 0.02128592 1.458320e+10
## PoolArea 0.00000000 1.086433e+08
## PoolQC 0.00000000 2.340141e+07
## Fence 0.94509165 2.306340e+09
## MiscFeature -2.01726723 4.423697e+08
## MiscVal -1.56387341 7.444683e+08
## MoSold 0.81551435 3.563767e+10
## YrSold 0.14567719 1.175499e+10
## SaleType 2.39047908 7.388047e+09
## SaleCondition 1.86038809 1.105829e+10
varImpPlot(rfImp, n.var = 10)
varImp(rfImp)
Normalize the data. The Utilities attribute for the train dataset all contain 0 – it is not possible to normalize that column if they all have the same value.
normalize <- function(x) {
return ((x-min(x)) / (max(x)-min(x)))
}
#Utilities was excluded as it can't be normalized since it has one-sided results in test
testNorm <- apply(modifiedTest[, c(2:9, 11:80)],2,function(x) (x-min(x)/(max(x)-min(x))))
trainNorm <- apply(modifiedTrain[, 2:81],2,function(x) (x-min(x)/(max(x)-min(x))))
Linear training model with K-fold cross-validation of 10-folds
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.3
model_train <- lm(SalePrice ~ ., data = data.frame(trainNorm))
model_train
##
## Call:
## lm(formula = SalePrice ~ ., data = data.frame(trainNorm))
##
## Coefficients:
## (Intercept) MSSubClass MSZoning LotFrontage LotArea
## 1.338e+06 -1.125e+02 -1.285e+03 -1.605e+02 4.145e-01
## Street Alley LotShape LandContour Utilities
## 3.126e+04 2.676e+03 -8.757e+02 3.150e+03 -5.410e+04
## LotConfig LandSlope Neighborhood Condition1 Condition2
## -7.691e+00 5.267e+03 2.548e+02 -8.847e+02 -9.228e+03
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## -2.868e+03 -1.078e+03 1.107e+04 5.338e+03 2.086e+02
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## -2.080e+01 2.090e+03 5.191e+03 -1.008e+03 3.864e+02
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 4.598e+03 3.346e+01 -1.008e+04 8.470e+02 8.812e+02
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## -4.230e+03 1.323e+03 -3.482e+03 -6.409e+02 8.738e+00
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 7.205e+02 1.138e+01 3.662e-01 NA -2.269e+03
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## -6.235e+02 7.358e+02 -5.148e+02 4.786e+01 4.324e+01
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## -1.769e+00 NA 6.341e+03 -4.110e+02 2.848e+03
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## -2.931e+02 -3.868e+03 -1.264e+04 -8.397e+03 3.853e+03
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 3.742e+03 4.997e+03 -1.550e+03 3.420e+02 -8.287e+00
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## -8.922e+02 1.379e+04 9.837e-02 -9.608e+02 9.871e+02
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 2.544e+03 2.215e+01 -2.948e+00 -1.275e+00 3.003e+01
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 4.602e+01 -2.826e+02 -8.800e+04 7.638e+01 -3.059e+03
## MiscVal MoSold YrSold SaleType SaleCondition
## 8.721e-02 -1.710e+02 -9.053e+02 -5.985e+02 3.180e+03
set.seed(123)
train.control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
# Train the model
model <- train(SalePrice ~., data = trainNorm, method = "lm",
trControl = train.control)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Summarize the results
print(model)
## Linear Regression
##
## 1460 samples
## 79 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1312, 1313, 1315, 1316, 1314, 1315, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 36126.82 0.7994177 20498.66
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
Plot of linear regression with with all attributes.
predicted_pricesLM <- predict(model, newdata = modifiedTrain)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
plot(predicted_pricesLM ,modifiedTrain$SalePrice,
xlab="Predicted Prices",ylab="Actual Prices", main = "Linear Regression - All Attributes")
abline(a=0,b=1)
Revised version with reduced feature selection that was selected via Random Forest
model_train <- lm(SalePrice ~ OverallQual+GrLivArea+GarageCars+YearBuilt+TotalBsmtSF+X1stFlrSF+GarageArea+ExterQual+BsmtFinSF1+BsmtQual, data = data.frame(trainNorm))
model_train
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## YearBuilt + TotalBsmtSF + X1stFlrSF + GarageArea + ExterQual +
## BsmtFinSF1 + BsmtQual, data = data.frame(trainNorm))
##
## Coefficients:
## (Intercept) OverallQual GrLivArea GarageCars YearBuilt TotalBsmtSF
## -2.308e+05 1.691e+04 4.639e+01 1.176e+04 1.292e+02 8.025e+00
## X1stFlrSF GarageArea ExterQual BsmtFinSF1 BsmtQual
## 1.137e+01 4.942e+00 -1.543e+04 2.079e+01 -4.999e+03
set.seed(123)
train.control <- trainControl(method = "repeatedcv",
number = 10, repeats = 3)
# Train the model
model <- train(SalePrice ~ OverallQual+GrLivArea+GarageCars+YearBuilt+TotalBsmtSF+X1stFlrSF+GarageArea+ExterQual+BsmtFinSF1+BsmtQual, data = trainNorm, method = "lm",
trControl = train.control)
# Summarize the results
print(model)
## Linear Regression
##
## 1460 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1312, 1313, 1315, 1316, 1314, 1315, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 36210.37 0.7973232 22593.49
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -532092 -17106 -564 15012 260160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.308e+05 8.968e+04 -2.574 0.01016 *
## OverallQual 1.691e+04 1.177e+03 14.375 < 2e-16 ***
## GrLivArea 4.638e+01 2.578e+00 17.992 < 2e-16 ***
## GarageCars 1.176e+04 2.900e+03 4.055 5.28e-05 ***
## YearBuilt 1.292e+02 4.542e+01 2.845 0.00451 **
## TotalBsmtSF 8.025e+00 4.234e+00 1.895 0.05824 .
## X1stFlrSF 1.137e+01 4.686e+00 2.426 0.01537 *
## GarageArea 4.942e+00 9.823e+00 0.503 0.61497
## ExterQual -1.543e+04 1.886e+03 -8.182 6.06e-16 ***
## BsmtFinSF1 2.079e+01 2.468e+00 8.425 < 2e-16 ***
## BsmtQual -4.999e+03 1.053e+03 -4.746 2.28e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36040 on 1449 degrees of freedom
## Multiple R-squared: 0.7956, Adjusted R-squared: 0.7942
## F-statistic: 564.1 on 10 and 1449 DF, p-value: < 2.2e-16
Predict some using the linear model of selected features.
predicted_pricesTrain <- predict(model, newdata = modifiedTrain)
head(predicted_pricesTrain)
## 1 2 3 4 5 6
## 221934.2 178414.8 222164.1 187881.6 278822.1 154281.5
predicted_prices <- predict(model, newdata = modifiedTest)
head(predicted_prices)
## 1 2 3 4 5 6
## 104347.4 156864.3 170980.1 172836.1 216285.1 168726.6
Plot of linear regression with with random-forest selected features.
plot(predicted_pricesTrain ,modifiedTrain$SalePrice,
xlab="Predicted Prices",ylab="Actual Prices", main = "Linear Regression with Random-Forest")
abline(a=0,b=1)
Partition Data
set.seed(100)
index = sample(1:nrow(modifiedTrain), 0.7*nrow(modifiedTrain))
train1 = modifiedTrain[index, ] #training
test1 = modifiedTrain[-index, ] #test data
dim(train1)
## [1] 1021 81
dim(test1)
## [1] 439 81
Scale numeric
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.0.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.0-2
cols = c("OverallQual", "GrLivArea", "GarageCars","YearBuilt","TotalBsmtSF", "X1stFlrSF", "GarageArea", "ExterQual", "BsmtFinSF1", "BsmtQual")
pre_proc_val <- preProcess(modifiedTrain[,cols], method = c("center", "scale"))
train1[,cols] = predict(pre_proc_val, train1[,cols])
test1[,cols] = predict(pre_proc_val, test1[,cols])
Regularize coefficients
cols_reg = c("OverallQual", "GrLivArea", "GarageCars","YearBuilt","TotalBsmtSF", "X1stFlrSF", "GarageArea", "ExterQual", "BsmtFinSF1", "BsmtQual", "SalePrice")
dummies <- dummyVars(SalePrice ~ ., data = modifiedTrain[,cols_reg])
train_dummies = predict(dummies, newdata = train1[,cols_reg])
test_dummies = predict(dummies, newdata = test1[,cols_reg])
print(dim(train_dummies)); print(dim(test_dummies))
## [1] 1021 10
## [1] 439 10
Develop regularized lasso regression to find best lambda
x <- as.matrix(train_dummies)
y_train <- train1$SalePrice
x_test <- as.matrix(test_dummies)
y_test <- test1$SalePrice
lambdas <- 10^seq(2, -3, by = -0.1)
lasso_reg <- cv.glmnet(x, y_train, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
lambda_best <- lasso_reg$lambda.min
lambda_best
## [1] 100
Model lasso regression
lasso_model <- glmnet(x, y_train, alpha = 1, lambda = lambda_best, standardize = TRUE)
eval_results <- function(true, predicted, df) {
SSE <- sum((predicted - true)^2)
SST <- sum((true - mean(true))^2)
R_square <- 1 - SSE / SST
RMSE = sqrt(SSE/nrow(df))
# Model performance metrics
data.frame(
RMSE = RMSE,
Rsquare = R_square
)
}
coef(lasso_model)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 181573.1438
## OverallQual 25470.8587
## GrLivArea 25464.2439
## GarageCars 10445.4066
## YearBuilt 2778.1501
## TotalBsmtSF 245.1876
## X1stFlrSF 5213.0791
## GarageArea .
## ExterQual -9925.5324
## BsmtFinSF1 9668.4035
## BsmtQual -6005.1565
predictions_train <- predict(lasso_model, s = lambda_best, newx = x)
eval_results(y_train, predictions_train, modifiedTrain)
predictions_test <- predict(lasso_model, s = lambda_best, newx = x_test)
eval_results(y_test, predictions_test, modifiedTest)
Plot of lasso regression with with random-forest selected features.
plot(predictions_train ,train1$SalePrice,
xlab="Predicted Prices",ylab="Actual Prices", main = "Lasso Regression with Random-Forest")
abline(a=0,b=1)
Make lasso regression letting its own algorithm determine features/attributes.
set.seed(100)
index = sample(1:nrow(modifiedTrain), 0.7*nrow(modifiedTrain))
train2 = modifiedTrain[index, ] #training
test2 = modifiedTrain[-index, ] #test data
pre_proc_val1 <- preProcess(modifiedTrain, method = c("center", "scale"))
train2 = predict(pre_proc_val, train2)
test2 = predict(pre_proc_val1, test2)
dummies <- dummyVars(SalePrice ~ ., data = modifiedTrain)
train_dummies1 = predict(dummies, newdata = train2)
test_dummies1 = predict(dummies, newdata = test2)
x <- as.matrix(train_dummies1)
y_train <- train2$SalePrice
x_test <- as.matrix(test_dummies1)
y_test <- test2$SalePrice
lambdas <- 10^seq(2, -3, by = -0.1)
lasso_reg <- cv.glmnet(x, y_train, alpha = 1, lambda = lambdas, standardize = TRUE, nfolds = 5)
lambda_best <- lasso_reg$lambda.min
lambda_best
## [1] 100
lasso_model <- glmnet(x, y_train, alpha = 1, lambda = lambda_best, standardize = TRUE)
eval_results <- function(true, predicted, df) {
SSE <- sum((predicted - true)^2)
SST <- sum((true - mean(true))^2)
R_square <- 1 - SSE / SST
RMSE = sqrt(SSE/nrow(df))
# Model performance metrics
data.frame(
RMSE = RMSE,
Rsquare = R_square
)
}
coef(lasso_model)
## 81 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.254367e+06
## Id -5.780810e-01
## MSSubClass -1.310120e+02
## MSZoning -1.546349e+03
## LotFrontage -1.120779e+02
## LotArea 5.199583e-01
## Street 3.725504e+04
## Alley 1.699418e+03
## LotShape -7.913624e+02
## LandContour 1.314962e+02
## Utilities -4.920764e+04
## LotConfig .
## LandSlope 1.547381e+03
## Neighborhood 1.228866e+02
## Condition1 6.656055e+01
## Condition2 -2.651524e+03
## BldgType -1.599719e+03
## HouseStyle -6.546593e+02
## OverallQual 1.618188e+04
## OverallCond 4.157100e+03
## YearBuilt 3.031376e+03
## YearRemodAdd -1.051323e+01
## RoofStyle 3.415164e+03
## RoofMatl 1.466345e+02
## Exterior1st -1.119925e+03
## Exterior2nd 1.061778e+02
## MasVnrType 4.352602e+03
## MasVnrArea 2.899631e+01
## ExterQual -6.089289e+03
## ExterCond 1.036432e+03
## Foundation 6.866980e+02
## BsmtQual -5.201855e+03
## BsmtCond 3.788160e+02
## BsmtExposure -3.393524e+03
## BsmtFinType1 -6.984326e+02
## BsmtFinSF1 4.904043e+03
## BsmtFinType2 4.720977e+02
## BsmtFinSF2 1.014427e+01
## BsmtUnfSF -9.173713e-01
## TotalBsmtSF .
## Heating -2.631787e+03
## HeatingQC -1.477152e+03
## CentralAir 9.262570e+02
## Electrical -2.020612e+02
## X1stFlrSF 2.621791e+03
## X2ndFlrSF .
## LowQualFinSF -5.772143e+01
## GrLivArea 2.360197e+04
## BsmtFullBath 6.506220e+03
## BsmtHalfBath -3.093603e+03
## FullBath 4.196272e+03
## HalfBath 1.866377e+03
## BedroomAbvGr -5.424431e+03
## KitchenAbvGr -1.602417e+04
## KitchenQual -7.586985e+03
## TotRmsAbvGrd 4.296537e+03
## Functional 5.454336e+03
## Fireplaces 3.426877e+03
## FireplaceQu -8.730977e+02
## GarageType 6.435110e+02
## GarageYrBlt -7.301378e+00
## GarageFinish -8.454088e+02
## GarageCars 9.386543e+03
## GarageArea 6.206339e+02
## GarageQual -5.972414e+02
## GarageCond 7.553067e+02
## PavedDrive 2.581035e+03
## WoodDeckSF 2.243428e+01
## OpenPorchSF 2.197549e+01
## EnclosedPorch -4.192071e+01
## X3SsnPorch 9.838724e+00
## ScreenPorch 4.627182e+01
## PoolArea -1.065078e+03
## PoolQC -2.490499e+05
## Fence .
## MiscFeature -3.243666e+03
## MiscVal -1.424219e-01
## MoSold -3.793662e+01
## YrSold -5.456195e+02
## SaleType -8.879488e+02
## SaleCondition 3.325754e+03
predictions_train <- predict(lasso_model, s = lambda_best, newx = x)
eval_results(y_train, predictions_train, modifiedTrain)
predictions_test <- predict(lasso_model, s = lambda_best, newx = x_test)
eval_results(y_test, predictions_test, modifiedTest)
Plot of lasso regression with all attributes.
plot(predictions_train ,train2$SalePrice,
xlab="Predicted Prices",ylab="Actual Prices", main = "Lasso Regression - All Attributes")
abline(a=0,b=1)